"""
Phase 4: R&D Efficiency Puzzle
==============================
Aging countries spend MORE on R&D but produce FEWER patents and less high-tech
exports. This script investigates why — testing whether aging shifts R&D
composition toward health/pharma (low patent yield per dollar) and away from
commercial tech/manufacturing.

Channels tested:
  A. Data assembly & proxy construction
  B. R&D efficiency regressions (patents per R&D dollar)
  C. Health expenditure as mediator
  D. Manufacturing channel
  E. Human capital quality
  F. Time dynamics of efficiency decline
  G. Summary of attenuation results
"""

import sys
sys.path.insert(0, '/mnt/c/demographics_capital_flows/multilateral/src')
from model import PanelGLS
import pandas as pd
import numpy as np
from pathlib import Path
import urllib.request
import json
import time

# ── Paths ──────────────────────────────────────────────────────────────────
BASE = Path('/mnt/c/demographics_capital_flows/innovation')
DATA_RAW = BASE / 'data' / 'raw'
DATA_PROC = BASE / 'data' / 'processed'
TABLE_DIR = BASE / 'output' / 'tables'
DATA_RAW.mkdir(parents=True, exist_ok=True)
TABLE_DIR.mkdir(parents=True, exist_ok=True)


# ── Helper functions ───────────────────────────────────────────────────────

def stars(p):
    if p < 0.01: return '***'
    if p < 0.05: return '**'
    if p < 0.10: return '*'
    return ''


def run_gls(df, yvar, xvars, label='', show=True):
    """Run PanelGLS and return results dict. Returns None if insufficient data."""
    cols = [yvar] + xvars + ['iso3', 'year']
    sub = df.dropna(subset=[yvar] + xvars).copy()
    if len(sub) < 60:
        if show:
            print(f"  [{label}] Skipped — only {len(sub)} obs")
        return None
    gls = PanelGLS()
    gls.fit(sub[yvar].values, sub[xvars].values, sub['iso3'].values, sub['year'].values)
    if show:
        sig = {v: stars(p) for v, p in zip(xvars, gls.pvalues)}
        print(f"  [{label}] N={gls.n_obs}, countries={gls.n_countries}, "
              f"R2={gls.r_squared:.4f}, rho={gls.rho:.3f}")
        for v, b, se, p in zip(xvars, gls.beta, gls.se, gls.pvalues):
            print(f"    {v:<30s} {b:>9.4f} ({se:.4f}) {stars(p)}")
    return {
        'label': label, 'yvar': yvar, 'xvars': xvars,
        'beta': dict(zip(xvars, gls.beta)),
        'se': dict(zip(xvars, gls.se)),
        'pvalues': dict(zip(xvars, gls.pvalues)),
        'r_squared': gls.r_squared,
        'n_obs': gls.n_obs, 'n_countries': gls.n_countries,
        'rho': gls.rho,
    }


def results_to_md_table(results_list, title, key_vars=None):
    """Convert list of result dicts to markdown table."""
    if not results_list:
        return f"# {title}\n\nNo results available.\n"
    lines = [f"# {title}\n"]
    for res in results_list:
        if res is None:
            continue
        lines.append(f"## {res['label']}")
        lines.append(f"- Dependent variable: `{res['yvar']}`")
        lines.append(f"- N = {res['n_obs']}, Countries = {res['n_countries']}, "
                      f"R² = {res['r_squared']:.4f}, ρ = {res['rho']:.3f}\n")
        lines.append("| Variable | Coefficient | Std Error | Sig |")
        lines.append("|:---------|:------------|:----------|:----|")
        for v in res['xvars']:
            b = res['beta'][v]
            se = res['se'][v]
            p = res['pvalues'][v]
            lines.append(f"| {v} | {b:.4f} | {se:.4f} | {stars(p)} |")
        lines.append("")
    return "\n".join(lines)


# ── Load data ──────────────────────────────────────────────────────────────
print("=" * 70)
print("PHASE 4: R&D EFFICIENCY PUZZLE")
print("=" * 70)

df = pd.read_csv(DATA_PROC / 'innovation_panel.csv')
df = df[df['year'] <= 2024].copy()
print(f"Panel loaded: {df['iso3'].nunique()} countries, {len(df)} obs, "
      f"{df['year'].min()}-{df['year'].max()}")


# ══════════════════════════════════════════════════════════════════════════
# PART A: Data Assembly & Proxy Construction
# ══════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART A: Data Assembly & Proxy Construction")
print("=" * 70)

# Try downloading manufacturing VA share from World Bank WDI
wdi_cache = DATA_RAW / 'wdi_manufacturing.csv'
mfg_col = 'manufacturing_va_share'

if wdi_cache.exists():
    print("  Loading cached WDI manufacturing data...")
    mfg_df = pd.read_csv(wdi_cache)
else:
    print("  Downloading WDI manufacturing VA (% GDP) — NV.IND.MANF.ZS ...")
    indicator = 'NV.IND.MANF.ZS'
    url = (f"https://api.worldbank.org/v2/country/all/indicator/{indicator}"
           f"?date=1990:2024&format=json&per_page=20000")
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = json.loads(resp.read().decode())
        if len(raw) >= 2 and raw[1]:
            rows = []
            for obs in raw[1]:
                if obs['value'] is not None:
                    rows.append({
                        'iso3': obs['countryiso3code'],
                        'year': int(obs['date']),
                        mfg_col: float(obs['value']),
                    })
            mfg_df = pd.DataFrame(rows)
            mfg_df = mfg_df[mfg_df['iso3'].str.len() == 3]
            mfg_df.to_csv(wdi_cache, index=False)
            print(f"    Downloaded {len(mfg_df)} obs for {mfg_df['iso3'].nunique()} countries")
        else:
            print("    WDI returned empty; will construct proxy")
            mfg_df = pd.DataFrame(columns=['iso3', 'year', mfg_col])
    except Exception as e:
        print(f"    WDI download failed: {e}")
        mfg_df = pd.DataFrame(columns=['iso3', 'year', mfg_col])

# Merge manufacturing data
if len(mfg_df) > 0 and mfg_col not in df.columns:
    df = df.merge(mfg_df[['iso3', 'year', mfg_col]], on=['iso3', 'year'], how='left')
    print(f"  Merged manufacturing VA: {df[mfg_col].notna().sum()} obs")
elif mfg_col in df.columns:
    print(f"  Manufacturing VA already in panel: {df[mfg_col].notna().sum()} obs")

# Try downloading researchers per million from WDI
researchers_cache = DATA_RAW / 'wdi_researchers.csv'
if researchers_cache.exists():
    print("  Loading cached WDI researchers data...")
    res_df = pd.read_csv(researchers_cache)
else:
    print("  Downloading WDI researchers per million — SP.POP.SCIE.RD.P6 ...")
    indicator = 'SP.POP.SCIE.RD.P6'
    url = (f"https://api.worldbank.org/v2/country/all/indicator/{indicator}"
           f"?date=1990:2024&format=json&per_page=20000")
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw = json.loads(resp.read().decode())
        if len(raw) >= 2 and raw[1]:
            rows = []
            for obs in raw[1]:
                if obs['value'] is not None:
                    rows.append({
                        'iso3': obs['countryiso3code'],
                        'year': int(obs['date']),
                        'researchers_per_million': float(obs['value']),
                    })
            res_df = pd.DataFrame(rows)
            res_df = res_df[res_df['iso3'].str.len() == 3]
            res_df.to_csv(researchers_cache, index=False)
            print(f"    Downloaded {len(res_df)} obs for {res_df['iso3'].nunique()} countries")
        else:
            print("    WDI returned empty for researchers")
            res_df = pd.DataFrame(columns=['iso3', 'year', 'researchers_per_million'])
    except Exception as e:
        print(f"    Researchers download failed: {e}")
        res_df = pd.DataFrame(columns=['iso3', 'year', 'researchers_per_million'])

if len(res_df) > 0 and 'researchers_per_million' not in df.columns:
    df = df.merge(res_df[['iso3', 'year', 'researchers_per_million']],
                  on=['iso3', 'year'], how='left')
    print(f"  Merged researchers: {df['researchers_per_million'].notna().sum()} obs")
elif 'researchers_per_million' in df.columns:
    print(f"  Researchers already in panel: {df['researchers_per_million'].notna().sum()} obs")

# ── Construct proxy variables ──────────────────────────────────────────────
print("\n  Constructing proxy variables...")

# Health R&D proxy: interaction of health spending and R&D spending
df['health_rd_proxy'] = df['health_exp_gdp'] * df['rd_expenditure_gdp']
print(f"  health_rd_proxy (health_exp × R&D): {df['health_rd_proxy'].notna().sum()} obs")

# R&D efficiency: log(patents_per_million / rd_expenditure_gdp)
mask = (df['patents_per_million'] > 0) & (df['rd_expenditure_gdp'] > 0)
df['rd_efficiency'] = np.nan
df.loc[mask, 'rd_efficiency'] = np.log(
    df.loc[mask, 'patents_per_million'] / df.loc[mask, 'rd_expenditure_gdp']
)
print(f"  rd_efficiency: {df['rd_efficiency'].notna().sum()} obs")

# Patents per researcher (if available)
if 'researchers_per_million' in df.columns:
    mask_r = (df['patents_per_million'] > 0) & (df['researchers_per_million'] > 0)
    df['patents_per_researcher'] = np.nan
    df.loc[mask_r, 'patents_per_researcher'] = (
        df.loc[mask_r, 'patents_per_million'] / df.loc[mask_r, 'researchers_per_million']
    )
    print(f"  patents_per_researcher: {df['patents_per_researcher'].notna().sum()} obs")

# log GDP per capita
df['log_gdp_pc'] = np.log(df['gdp_pc_ppp'].clip(lower=100))
print(f"  log_gdp_pc: {df['log_gdp_pc'].notna().sum()} obs")

# OECD dummy already exists
print(f"  oecd dummy: {df['oecd'].sum()} obs (OECD=1)")

# Income terciles
df['income_tercile'] = pd.qcut(
    df.groupby('year')['gdp_pc_ppp'].transform('rank', pct=True),
    3, labels=['Low', 'Mid', 'High']
)

# Save data summary
data_lines = [
    "# Efficiency Puzzle — Data Summary\n",
    "## Variables constructed\n",
    "| Variable | Definition | N obs |",
    "|:---------|:-----------|:------|",
    f"| rd_efficiency | log(patents_per_million / rd_expenditure_gdp) | {df['rd_efficiency'].notna().sum()} |",
    f"| health_rd_proxy | health_exp_gdp × rd_expenditure_gdp | {df['health_rd_proxy'].notna().sum()} |",
    f"| log_gdp_pc | log(gdp_pc_ppp) | {df['log_gdp_pc'].notna().sum()} |",
    f"| manufacturing_va_share | WDI NV.IND.MANF.ZS | {df[mfg_col].notna().sum() if mfg_col in df.columns else 0} |",
]
if 'researchers_per_million' in df.columns:
    data_lines.append(
        f"| researchers_per_million | WDI SP.POP.SCIE.RD.P6 | {df['researchers_per_million'].notna().sum()} |"
    )
    data_lines.append(
        f"| patents_per_researcher | patents_per_million / researchers_per_million | {df['patents_per_researcher'].notna().sum() if 'patents_per_researcher' in df.columns else 0} |"
    )
data_lines.append("")
(TABLE_DIR / 'efficiency_data.md').write_text("\n".join(data_lines))
print(f"\n  Saved: {TABLE_DIR / 'efficiency_data.md'}")


# ── Standard controls ─────────────────────────────────────────────────────
demo_vars = ['Z_1', 'Z_2', 'Z_3']
controls = ['rgdp_growth', 'kaopen', 'log_gdp_pc']


# ══════════════════════════════════════════════════════════════════════════
# PART B: R&D Efficiency Analysis
# ══════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART B: R&D Efficiency Analysis")
print("=" * 70)

results_b = []

# B1: Z → rd_efficiency (full sample)
r = run_gls(df, 'rd_efficiency', demo_vars + controls, label='B1: Z → rd_efficiency (full)')
results_b.append(r)

# B2: old_dep → rd_efficiency
r = run_gls(df, 'rd_efficiency', ['old_dep'] + controls, label='B2: old_dep → rd_efficiency')
results_b.append(r)

# B3: OECD only
r = run_gls(df[df['oecd'] == 1], 'rd_efficiency', demo_vars + controls,
            label='B3: Z → rd_efficiency (OECD)')
results_b.append(r)

# B4: Non-OECD
r = run_gls(df[df['oecd'] == 0], 'rd_efficiency', demo_vars + controls,
            label='B4: Z → rd_efficiency (non-OECD)')
results_b.append(r)

# B5-B7: Income tercile splits
for terc in ['Low', 'Mid', 'High']:
    sub = df[df['income_tercile'] == terc]
    r = run_gls(sub, 'rd_efficiency', demo_vars + controls,
                label=f'B5-7: Z → rd_efficiency ({terc} income)')
    results_b.append(r)

# B8: Researchers per R&D dollar (if available)
if 'patents_per_researcher' in df.columns:
    df['log_patents_per_researcher'] = np.log(df['patents_per_researcher'].clip(lower=0.001))
    r = run_gls(df, 'log_patents_per_researcher', demo_vars + controls,
                label='B8: Z → log(patents/researcher)')
    results_b.append(r)

md_b = results_to_md_table(results_b, "R&D Efficiency Regressions")
(TABLE_DIR / 'efficiency_regressions.md').write_text(md_b)
print(f"\n  Saved: {TABLE_DIR / 'efficiency_regressions.md'}")


# ══════════════════════════════════════════════════════════════════════════
# PART C: Health Expenditure as Mediator
# ══════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART C: Health Expenditure as Mediator")
print("=" * 70)

results_c = []

# C1: Z → health_exp_gdp
r = run_gls(df, 'health_exp_gdp', demo_vars + controls,
            label='C1: Z → health_exp_gdp')
results_c.append(r)

# C2: health_exp_gdp → patents_per_million
r = run_gls(df, 'patents_per_million', ['health_exp_gdp'] + controls,
            label='C2: health_exp → patents_per_million')
results_c.append(r)

# C3: Z → patents_per_million WITHOUT health control
r = run_gls(df, 'patents_per_million', demo_vars + controls,
            label='C3: Z → patents_pm (no health control)')
results_c.append(r)

# C4: Z → patents_per_million WITH health control (attenuation test)
r = run_gls(df, 'patents_per_million', demo_vars + ['health_exp_gdp'] + controls,
            label='C4: Z → patents_pm (with health control)')
results_c.append(r)

# C5: Z → hightech_exports_share WITHOUT health control
r = run_gls(df, 'hightech_exports_share', demo_vars + controls,
            label='C5: Z → hightech_exports (no health)')
results_c.append(r)

# C6: Z → hightech_exports_share WITH health control
r = run_gls(df, 'hightech_exports_share', demo_vars + ['health_exp_gdp'] + controls,
            label='C6: Z → hightech_exports (with health)')
results_c.append(r)

# C7: Z → rd_efficiency WITH health control
r = run_gls(df, 'rd_efficiency', demo_vars + ['health_exp_gdp'] + controls,
            label='C7: Z → rd_efficiency (with health)')
results_c.append(r)

# C8: health_rd_proxy → patents (interaction term)
r = run_gls(df, 'patents_per_million', demo_vars + ['health_rd_proxy'] + controls,
            label='C8: Z + health_rd_proxy → patents_pm')
results_c.append(r)

md_c = results_to_md_table(results_c, "Health Expenditure as Mediator")
(TABLE_DIR / 'efficiency_health_mediation.md').write_text(md_c)
print(f"\n  Saved: {TABLE_DIR / 'efficiency_health_mediation.md'}")


# ══════════════════════════════════════════════════════════════════════════
# PART D: Manufacturing Channel
# ══════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART D: Manufacturing Channel")
print("=" * 70)

results_d = []

if mfg_col in df.columns and df[mfg_col].notna().sum() > 100:
    # D1: Z → manufacturing_va_share
    r = run_gls(df, mfg_col, demo_vars + controls,
                label='D1: Z → manufacturing VA share')
    results_d.append(r)

    # D2: manufacturing → patents
    r = run_gls(df, 'patents_per_million', [mfg_col] + controls,
                label='D2: manufacturing → patents_pm')
    results_d.append(r)

    # D3: Z → patents WITHOUT manufacturing
    r = run_gls(df, 'patents_per_million', demo_vars + controls,
                label='D3: Z → patents_pm (no mfg control)')
    results_d.append(r)

    # D4: Z → patents WITH manufacturing (attenuation test)
    r = run_gls(df, 'patents_per_million', demo_vars + [mfg_col] + controls,
                label='D4: Z → patents_pm (with mfg control)')
    results_d.append(r)

    # D5: Z → rd_efficiency WITH manufacturing
    r = run_gls(df, 'rd_efficiency', demo_vars + [mfg_col] + controls,
                label='D5: Z → rd_efficiency (with mfg)')
    results_d.append(r)

    # D6: Z → hightech WITH manufacturing
    r = run_gls(df, 'hightech_exports_share', demo_vars + [mfg_col] + controls,
                label='D6: Z → hightech (with mfg)')
    results_d.append(r)
else:
    print("  Manufacturing VA share not available — skipping Part D")
    # Use trade_openness as partial proxy for economic structure
    if df['trade_openness'].notna().sum() > 200:
        r = run_gls(df, 'patents_per_million', demo_vars + ['trade_openness'] + controls,
                    label='D-alt: Z → patents_pm (with trade openness)')
        results_d.append(r)

md_d = results_to_md_table(results_d, "Manufacturing Channel")
(TABLE_DIR / 'efficiency_manufacturing.md').write_text(md_d)
print(f"\n  Saved: {TABLE_DIR / 'efficiency_manufacturing.md'}")


# ══════════════════════════════════════════════════════════════════════════
# PART E: Human Capital Quality
# ══════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART E: Human Capital Quality")
print("=" * 70)

results_e = []

# E1: Z → human_capital
r = run_gls(df, 'human_capital', demo_vars + controls,
            label='E1: Z → human_capital')
results_e.append(r)

# E2: human_capital → rd_efficiency
r = run_gls(df, 'rd_efficiency', ['human_capital'] + controls,
            label='E2: human_capital → rd_efficiency')
results_e.append(r)

# E3: Z → patents WITHOUT human capital
r = run_gls(df, 'patents_per_million', demo_vars + controls,
            label='E3: Z → patents_pm (no HC)')
results_e.append(r)

# E4: Z → patents WITH human capital
r = run_gls(df, 'patents_per_million', demo_vars + ['human_capital'] + controls,
            label='E4: Z → patents_pm (with HC)')
results_e.append(r)

# E5: Interaction Z_1 × human_capital
df['Z1_x_hc'] = df['Z_1'] * df['human_capital']
r = run_gls(df, 'patents_per_million',
            demo_vars + ['human_capital', 'Z1_x_hc'] + controls,
            label='E5: Z + Z1×HC → patents_pm')
results_e.append(r)

# E6: Z → rd_efficiency WITH human capital
r = run_gls(df, 'rd_efficiency', demo_vars + ['human_capital'] + controls,
            label='E6: Z → rd_efficiency (with HC)')
results_e.append(r)

# E7: Interaction Z_1 × human_capital → rd_efficiency
r = run_gls(df, 'rd_efficiency',
            demo_vars + ['human_capital', 'Z1_x_hc'] + controls,
            label='E7: Z + Z1×HC → rd_efficiency')
results_e.append(r)

md_e = results_to_md_table(results_e, "Human Capital Quality Channel")
(TABLE_DIR / 'efficiency_human_capital.md').write_text(md_e)
print(f"\n  Saved: {TABLE_DIR / 'efficiency_human_capital.md'}")


# ══════════════════════════════════════════════════════════════════════════
# PART F: Time Dynamics of the Efficiency Decline
# ══════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART F: Time Dynamics")
print("=" * 70)

results_f = []

# F1: Pre-2000
r = run_gls(df[df['year'] < 2000], 'rd_efficiency', demo_vars + controls,
            label='F1: Z → rd_efficiency (pre-2000)')
results_f.append(r)

# F2: Post-2000
r = run_gls(df[df['year'] >= 2000], 'rd_efficiency', demo_vars + controls,
            label='F2: Z → rd_efficiency (post-2000)')
results_f.append(r)

# F3: Pre-2010
r = run_gls(df[df['year'] < 2010], 'rd_efficiency', demo_vars + controls,
            label='F3: Z → rd_efficiency (pre-2010)')
results_f.append(r)

# F4: Post-2010
r = run_gls(df[df['year'] >= 2010], 'rd_efficiency', demo_vars + controls,
            label='F4: Z → rd_efficiency (post-2010)')
results_f.append(r)

# F5: Same splits for patents_per_million
r = run_gls(df[df['year'] < 2000], 'patents_per_million', demo_vars + controls,
            label='F5: Z → patents_pm (pre-2000)')
results_f.append(r)

r = run_gls(df[df['year'] >= 2000], 'patents_per_million', demo_vars + controls,
            label='F6: Z → patents_pm (post-2000)')
results_f.append(r)

# F7-F8: Time trend interaction
df['post2000'] = (df['year'] >= 2000).astype(float)
df['Z1_x_post2000'] = df['Z_1'] * df['post2000']
r = run_gls(df, 'rd_efficiency',
            demo_vars + ['post2000', 'Z1_x_post2000'] + controls,
            label='F7: Z + Z1×post2000 → rd_efficiency')
results_f.append(r)

df['post2010'] = (df['year'] >= 2010).astype(float)
df['Z1_x_post2010'] = df['Z_1'] * df['post2010']
r = run_gls(df, 'rd_efficiency',
            demo_vars + ['post2010', 'Z1_x_post2010'] + controls,
            label='F8: Z + Z1×post2010 → rd_efficiency')
results_f.append(r)

md_f = results_to_md_table(results_f, "Time Dynamics of Efficiency Decline")
(TABLE_DIR / 'efficiency_time.md').write_text(md_f)
print(f"\n  Saved: {TABLE_DIR / 'efficiency_time.md'}")


# ══════════════════════════════════════════════════════════════════════════
# PART G: Summary — Attenuation Analysis
# ══════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 70)
print("PART G: Attenuation Summary")
print("=" * 70)

# Collect Z_1 coefficients across specifications to measure attenuation
summary_rows = []

def add_summary(label, res, baseline_z1_coef=None):
    if res is None:
        return
    z1_b = res['beta'].get('Z_1', np.nan)
    z1_p = res['pvalues'].get('Z_1', np.nan)
    attenuation = np.nan
    if baseline_z1_coef is not None and baseline_z1_coef != 0 and not np.isnan(z1_b):
        attenuation = (1 - z1_b / baseline_z1_coef) * 100
    summary_rows.append({
        'Specification': label,
        'DV': res['yvar'],
        'Z_1 coef': z1_b,
        'Z_1 p-val': z1_p,
        'Attenuation %': attenuation,
        'R²': res['r_squared'],
        'N': res['n_obs'],
    })


# Baseline Z_1 on patents_per_million (from C3)
baseline_patents = [r for r in results_c if r and 'no health' in r['label'] and 'patents_pm' in r['label']]
baseline_z1_patents = baseline_patents[0]['beta'].get('Z_1', 0) if baseline_patents else None

# Baseline Z_1 on rd_efficiency (from B1)
baseline_eff = results_b[0] if results_b and results_b[0] else None
baseline_z1_eff = baseline_eff['beta'].get('Z_1', 0) if baseline_eff else None

add_summary('Baseline: Z → patents_pm', baseline_patents[0] if baseline_patents else None)
add_summary('Baseline: Z → rd_efficiency', baseline_eff)

# Health mediation on patents
health_patents = [r for r in results_c if r and 'with health' in r['label'] and 'patents_pm' in r['label']]
if health_patents:
    add_summary('+ Health expenditure → patents_pm', health_patents[0], baseline_z1_patents)

# Health mediation on rd_efficiency
health_eff = [r for r in results_c if r and 'with health' in r['label'] and 'rd_efficiency' in r['label']]
if health_eff:
    add_summary('+ Health expenditure → rd_efficiency', health_eff[0], baseline_z1_eff)

# Manufacturing mediation on patents
mfg_patents = [r for r in results_d if r and 'with mfg' in r['label'] and 'patents_pm' in r['label']]
if mfg_patents:
    add_summary('+ Manufacturing VA → patents_pm', mfg_patents[0], baseline_z1_patents)

# Manufacturing mediation on rd_efficiency
mfg_eff = [r for r in results_d if r and 'with mfg' in r['label'] and 'rd_efficiency' in r['label']]
if mfg_eff:
    add_summary('+ Manufacturing VA → rd_efficiency', mfg_eff[0], baseline_z1_eff)

# Human capital mediation on patents
hc_patents = [r for r in results_e if r and 'with HC' in r['label'] and 'patents_pm' in r['label']]
if hc_patents:
    add_summary('+ Human capital → patents_pm', hc_patents[0], baseline_z1_patents)

# Human capital mediation on rd_efficiency
hc_eff = [r for r in results_e if r and 'with HC' in r['label'] and 'rd_efficiency' in r['label']]
if hc_eff:
    add_summary('+ Human capital → rd_efficiency', hc_eff[0], baseline_z1_eff)

# Print summary
if summary_rows:
    sdf = pd.DataFrame(summary_rows)
    print("\n  Attenuation of Z_1 coefficient across specifications:")
    print(sdf.to_string(index=False, float_format=lambda x: f'{x:.4f}' if abs(x) < 100 else f'{x:.1f}'))

    # Write markdown
    lines = ["# Efficiency Puzzle — Summary of Attenuation Results\n"]
    lines.append("How much does each channel explain the aging → innovation disconnect?\n")
    lines.append("| Specification | DV | Z_1 coef | Z_1 sig | Attenuation % | R² | N |")
    lines.append("|:-------------|:---|:---------|:--------|:--------------|:---|:--|")
    for _, row in sdf.iterrows():
        z1_s = stars(row['Z_1 p-val']) if not np.isnan(row['Z_1 p-val']) else ''
        att = f"{row['Attenuation %']:.1f}%" if not np.isnan(row['Attenuation %']) else '—'
        lines.append(
            f"| {row['Specification']} | {row['DV']} | {row['Z_1 coef']:.4f} | "
            f"{z1_s} | {att} | {row['R²']:.4f} | {row['N']} |"
        )
    lines.append("\n## Interpretation\n")
    lines.append("Attenuation % measures how much the Z_1 coefficient shrinks when the channel ")
    lines.append("variable is added as a control. Higher attenuation = more of the demographic ")
    lines.append("effect on innovation is mediated through that channel.\n")
    (TABLE_DIR / 'efficiency_summary.md').write_text("\n".join(lines))
    print(f"\n  Saved: {TABLE_DIR / 'efficiency_summary.md'}")
else:
    print("  No summary rows to report")

print("\n" + "=" * 70)
print("PHASE 4 COMPLETE")
print("=" * 70)
